# Loading the required packages.
# NOTE(review): load order matters here — Rmisc loads plyr, whose summarise()
# and mutate() mask the dplyr versions loaded earlier via tidyverse/dplyr.
# Later code works around this with an explicit dplyr::summarise(); confirm
# this ordering is intentional before changing it.
library(tidyverse)
library(dplyr)
library(Rmisc)
library(Hmisc)
library(emmeans)
library(grid)
library(gridExtra)
library(knitr)
library(car)
# Widen console output so summary()/str() tables are not wrapped in the knit.
options(width=100)
| Variable | Description |
|---|---|
| student_ID | The unique IDs for each student at school |
| tutoring | Shows which student received tutoring (TRUE = Tutored, FALSE = Not tutored) |
| absences | The proportion (%) of classes missed by each student |
| score.t1 | The test score at the beginning of the academic year for each student (before implementing buddying scheme) |
| score.t2 | The test score at the end of the academic year for each student (after implementing buddying scheme) |
# Reading the tutoring file into R with readr::read_csv (comma-delimited,
# despite the .txt extension); column types are guessed and reported.
tutoring_data <- read_csv("tutoring_test_data.txt")
# Checking the structure of the data (column types, row count) to see what
# amending is needed before analysis.
str(tutoring_data)
## spec_tbl_df [202 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ student_ID: num [1:202] 56 40 63 108 72 10 15 42 152 104 ...
## $ tutoring : logi [1:202] TRUE TRUE TRUE FALSE TRUE TRUE ...
## $ absences : num [1:202] 3.6 2.4 3.6 4.8 4.8 ...
## $ score.t1 : num [1:202] 70.4 67.8 40.2 75.3 31 ...
## $ score.t2 : num [1:202] 75.2 75.9 45.5 79.7 31.7 ...
## - attr(*, "spec")=
## .. cols(
## .. student_ID = col_double(),
## .. tutoring = col_logical(),
## .. absences = col_double(),
## .. score.t1 = col_double(),
## .. score.t2 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
The datatype for tutoring variable needs to be changed to factor
# Checking for outliers and NA values via per-column five-number summaries.
summary(tutoring_data)
## student_ID tutoring absences score.t1 score.t2
## Min. : 1.00 Mode :logical Min. : 1.200 Min. :17.31 Min. : 11.92
## 1st Qu.: 51.25 FALSE:101 1st Qu.: 3.600 1st Qu.:46.56 1st Qu.: 46.47
## Median :101.50 TRUE :101 Median : 6.000 Median :53.96 Median : 55.26
## Mean :101.50 Mean : 7.012 Mean :53.89 Mean : 56.23
## 3rd Qu.:151.75 3rd Qu.: 8.400 3rd Qu.:62.83 3rd Qu.: 65.01
## Max. :202.00 Max. :100.000 Max. :88.46 Max. :200.00
## NA's :1
There seem to be some outliers and NA values present
# Checking for duplicate entries of student ID and keeping only the first
# row per ID. distinct(.keep_all = TRUE) retains the first occurrence of
# each student_ID — the same behavior as the original
# group_by()/filter(!duplicated())/ungroup() pipeline, but without the
# unnecessary grouping step.
tutoring_data_new <- tutoring_data %>%
  distinct(student_ID, .keep_all = TRUE)
# Checking the number of rows removed (0 means no duplicate IDs existed).
nrow(tutoring_data) - nrow(tutoring_data_new) #No duplicates were found for this data
## [1] 0
# Checking the distribution of each numeric variable with a histogram so
# that skew and outliers are visible before cleaning. A small helper keeps
# the three panels consistent (same binwidth and axis labels).
hist_panel <- function(var, x_label, panel_title) {
  ggplot(tutoring_data_new, aes({{ var }})) +
    geom_histogram(binwidth = 0.5) +
    labs(x = x_label, y = "Count", title = panel_title)
}
grid.arrange(
  hist_panel(absences, "Absences", "Distribution of absences"),
  hist_panel(score.t1, "Scores", "Distribution of scores before tutoring scheme"),
  hist_panel(score.t2, "Scores", "Distribution of scores after tutoring scheme")
)
The distributions confirm the presence of some outliers in score.t2 and absences
# Preparing the data based on observation of structures and visualizations.
# Recode the logical tutoring flag as a labelled factor so that models and
# plots show "Non-Tutored"/"Tutored" instead of FALSE/TRUE; FALSE maps to
# the reference level "Non-Tutored".
tutoring_data_new$tutoring <- factor(tutoring_data_new$tutoring,
levels = c("FALSE", "TRUE"),
labels = c("Non-Tutored", "Tutored")) #Converting into categorical data
levels(tutoring_data_new$tutoring) #Checking the levels of the factors
## [1] "Non-Tutored" "Tutored"
# Dropping the NA score and the implausible values spotted earlier:
# score.t2 > 100 is outside the test's range and absences >= 25 removes the
# extreme outlier (summary showed a max of 100%).
tutoring_data_new <- tutoring_data_new %>%
filter(score.t2 <= 100 & absences < 25 & !is.na(score.t2)) %>% #Removing NA values and outliers before analysis
arrange(student_ID) #Arranging data by student ID
str(tutoring_data_new) #Checking structure again to see the corrections
## tibble [200 × 5] (S3: tbl_df/tbl/data.frame)
## $ student_ID: num [1:200] 1 2 3 4 5 6 7 8 9 10 ...
## $ tutoring : Factor w/ 2 levels "Non-Tutored",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ absences : num [1:200] 4.8 6 20.4 7.2 7.2 ...
## $ score.t1 : num [1:200] 39.4 51.8 44.6 56.7 43.4 ...
## $ score.t2 : num [1:200] 30.8 52.1 53.1 55.3 55 ...
summary(tutoring_data_new)
## student_ID tutoring absences score.t1 score.t2
## Min. : 1.00 Non-Tutored:100 Min. : 1.200 Min. :17.31 Min. :11.92
## 1st Qu.: 50.75 Tutored :100 1st Qu.: 3.600 1st Qu.:46.30 1st Qu.:46.45
## Median :100.50 Median : 6.000 Median :53.96 Median :55.20
## Mean :100.50 Mean : 6.552 Mean :53.85 Mean :55.51
## 3rd Qu.:150.25 3rd Qu.: 8.400 3rd Qu.:62.87 3rd Qu.:64.88
## Max. :200.00 Max. :20.400 Max. :88.46 Max. :93.21
# Producing whole-sample summary statistics (mean and SD) for each item.
tutoring_summary <- tutoring_data_new %>%
  summarise(mean_absence = mean(absences), sd_absence = sd(absences),
            mean_score.t1 = mean(score.t1), sd_score.t1 = sd(score.t1),
            mean_score.t2 = mean(score.t2), sd_score.t2 = sd(score.t2))
# Unpacking every summary column into an identically named top-level
# variable (mean_absence, sd_absence, ...) for use in the plots below.
for (stat_name in names(tutoring_summary)) {
  assign(stat_name, tutoring_summary[[stat_name]])
}
# Comparing each variable against a normal curve built from the sample mean
# and SD, faceted by tutoring group; the blue vertical line marks the
# overall mean. NOTE(review): the dnorm() overlay uses whole-sample
# statistics in every facet rather than per-group ones — acceptable for a
# rough normality check, but confirm that is the intent.
grid.arrange((ggplot(tutoring_data_new, aes(x=absences)) +
geom_histogram(aes(y=..density..)) +
stat_function(fun=function(x) {dnorm(x, mean=mean_absence, sd=sd_absence)}) +
geom_vline(xintercept = mean_absence, color = "Blue") +
facet_wrap(~tutoring) +
labs(x="Absences", y="Density", title = "Comparison for Absences data")),
(ggplot(tutoring_data_new, aes(x=score.t1)) +
geom_histogram(aes(y=..density..)) +
stat_function(fun=function(x) {dnorm(x, mean=mean_score.t1, sd=sd_score.t1)}) +
geom_vline(xintercept = mean_score.t1, color = "Blue") +
facet_wrap(~tutoring) +
labs(x="Score Before Scheme", y="Density", title = "Comparison for Scores before Scheme")),
(ggplot(tutoring_data_new, aes(x=score.t2)) +
geom_histogram(aes(y=..density..)) +
# BUG FIX: this overlay previously used mean=mean_score.t1; the score.t2
# panel must use mean_score.t2 (matching the geom_vline below).
stat_function(fun=function(x) {dnorm(x, mean=mean_score.t2, sd=sd_score.t2)}) +
geom_vline(xintercept = mean_score.t2, color = "Blue") +
facet_wrap(~tutoring) +
labs(x="Score After Scheme", y="Density", title = "Comparison for Scores after Scheme")),
nrow = 3)
The distributions seem fairly normal except the distribution for absence data seems slightly positively skewed. We proceed with our analysis
NHST Approach
# NHST: Welch two-sample t-test of baseline scores (score.t1) by tutoring
# group — were the groups comparable before the scheme began? The outer
# parentheses print the stored result.
( scores.before.t.test <- t.test(score.t1 ~ tutoring, tutoring_data_new) )
##
## Welch Two Sample t-test
##
## data: score.t1 by tutoring
## t = -1.0467, df = 196.54, p-value = 0.2965
## alternative hypothesis: true difference in means between group Non-Tutored and group Tutored is not equal to 0
## 95 percent confidence interval:
## -5.433861 1.665720
## sample estimates:
## mean in group Non-Tutored mean in group Tutored
## 52.90345 54.78753
Estimation Approach
# Estimation approach: linear model of baseline score on tutoring status
# (equivalent to the t-test, but yields estimated marginal means directly).
scores.before.lm <- lm(score.t1 ~ tutoring, tutoring_data_new)
# Extracting each group's estimated mean and its 95% confidence interval.
scores.before.emm <- emmeans(scores.before.lm, ~tutoring)
kable(scores.before.emm, caption = "Mean scores and 95% CIs ")
| tutoring | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Non-Tutored | 52.90345 | 1.272791 | 198 | 50.39349 | 55.41342 |
| Tutored | 54.78753 | 1.272791 | 198 | 52.27756 | 57.29749 |
# Estimating the between-group difference in means with its 95% CI;
# reverse = TRUE makes the contrast Tutored - Non-Tutored.
scores.before.contrast <- confint(pairs(scores.before.emm, reverse = TRUE))
kable(scores.before.contrast, caption = "Differences between the mean scores before the scheme")
| contrast | estimate | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Tutored - (Non-Tutored) | 1.884071 | 1.799998 | 198 | -1.665557 | 5.433699 |
# Visualizing the estimations: left panel shows each group's mean baseline
# score with its 95% CI; right panel shows the Tutored - Non-Tutored
# difference. The dashed line at 0 marks "no effect" and shared y-limits
# keep the two panels comparable.
avg.scores.before <- grid.arrange(
ggplot(summary(scores.before.emm), aes(y=emmean, x=tutoring, ymin=lower.CL, ymax=upper.CL, color = tutoring)) +
geom_point() +
geom_linerange() +
geom_hline(yintercept=0, lty=2) +
labs(x="Students", y="Scores", color = "Tutoring",
subtitle="Error bars are 95% CIs", title="Scores Before the Scheme") +
ylim(-2, 60),
ggplot(scores.before.contrast, aes(y=estimate, x=contrast, ymin=lower.CL, ymax=upper.CL)) +
geom_point() +
geom_linerange() +
labs(x="Students", y="Difference in scores",
subtitle="Error bars are 95% CIs", title="Difference in Score before the Scheme") +
geom_hline(yintercept=0, lty=2) +
ylim(-2, 60),
ncol=2, widths = c(2,1.75))
NHST Approach
# NHST: Welch two-sample t-test of absence rates by tutoring group.
( absences.t.test <- t.test(absences ~ tutoring, tutoring_data_new) )
##
## Welch Two Sample t-test
##
## data: absences by tutoring
## t = -0.98528, df = 197.6, p-value = 0.3257
## alternative hypothesis: true difference in means between group Non-Tutored and group Tutored is not equal to 0
## 95 percent confidence interval:
## -1.440721 0.480721
## sample estimates:
## mean in group Non-Tutored mean in group Tutored
## 6.312 6.792
Estimation Approach
# Estimation approach: linear model of absence rate on tutoring status only.
absences.lm <- lm(absences ~ tutoring, tutoring_data_new)
summary(absences.lm)
##
## Call:
## lm(formula = absences ~ tutoring, data = tutoring_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.592 -2.712 -0.792 1.728 13.608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.3120 0.3445 18.323 <2e-16 ***
## tutoringTutored 0.4800 0.4872 0.985 0.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.445 on 198 degrees of freedom
## Multiple R-squared: 0.004879, Adjusted R-squared: -0.0001469
## F-statistic: 0.9708 on 1 and 198 DF, p-value: 0.3257
# Checking for multicollinearity among candidate predictors of absences.
# (The original comment said "interactivity", but vif() measures variance
# inflation from correlated predictors, not interaction effects.)
absences.lm.scores <- lm(absences ~ tutoring + score.t1 + score.t2, tutoring_data_new)
vif(absences.lm.scores)
## tutoring score.t1 score.t2
## 1.133473 6.007094 6.248181
The VIF values for score.t1 and score.t2 are similar and both above 5, indicating that the two scores are highly collinear with each other. This suggests retaining only one of them, so whether the additional complexity of the model is warranted needs to be investigated.
# Multiple regression of absences on tutoring plus baseline score only
# (score.t2 is dropped because of its collinearity with score.t1 shown by
# the VIF check above).
absences.lm.score.t1 <- lm(absences ~ tutoring + score.t1, tutoring_data_new)
summary(absences.lm.score.t1)
##
## Call:
## lm(formula = absences ~ tutoring + score.t1, data = tutoring_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2682 -2.2693 -0.6267 1.8597 12.6464
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.31919 1.00901 11.218 < 2e-16 ***
## tutoringTutored 0.65832 0.45883 1.435 0.153
## score.t1 -0.09465 0.01807 -5.239 4.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.235 on 197 degrees of freedom
## Multiple R-squared: 0.1266, Adjusted R-squared: 0.1177
## F-statistic: 14.27 on 2 and 197 DF, p-value: 1.624e-06
# Nested-model F test: does keeping score.t2 on top of tutoring + score.t1
# improve the fit enough to justify the extra complexity?
anova(absences.lm.scores, absences.lm.score.t1) #Inclusion of both score.t1 and score.t2 doesn't improve the model
## Analysis of Variance Table
##
## Model 1: absences ~ tutoring + score.t1 + score.t2
## Model 2: absences ~ tutoring + score.t1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 196 2056.0
## 2 197 2062.3 -1 -6.2715 0.5979 0.4403
# Nested-model F test: does adding score.t1 to the tutoring-only model improve the fit?
anova(absences.lm, absences.lm.score.t1) #Inclusion of score.t1 does improve the model
## Analysis of Variance Table
##
## Model 1: absences ~ tutoring
## Model 2: absences ~ tutoring + score.t1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 2349.6
## 2 197 2062.3 1 287.34 27.448 4.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extracting each group's mean absence rate and 95% CI from the tutoring-only model.
absences.emm <- emmeans(absences.lm, ~tutoring)
kable(absences.emm, caption = "Mean absence rates and their 95% CIs by tutoring")
| tutoring | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Non-Tutored | 6.312 | 0.3444817 | 198 | 5.632676 | 6.991324 |
| Tutored | 6.792 | 0.3444817 | 198 | 6.112676 | 7.471324 |
# Estimating the Tutored - Non-Tutored difference in mean absence rate with its 95% CI.
absences.contrast <- confint(pairs(absences.emm, reverse = TRUE))
kable(absences.contrast, caption = "Differences in the absence rates by tutoring")
| contrast | estimate | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Tutored - (Non-Tutored) | 0.48 | 0.4871707 | 198 | -0.4807091 | 1.440709 |
# Visualizing the estimations from the tutoring-only model: group means
# (left) and their difference (right), flipped horizontally; the dashed
# line at 0 marks "no effect".
grid2.1 <- grid.arrange(
ggplot(summary(absences.emm), aes(x = tutoring, y = emmean, ymin = lower.CL, ymax = upper.CL, color = tutoring)) +
geom_point() +
geom_linerange() +
labs(y = "Absence Rate", x = "Students", color = "Tutoring",
subtitle = "Error bars are 95% CIs", title = "Mean absence rates") +
geom_hline(yintercept=0, lty=2) +
ylim(-1.5, 8) +
coord_flip(),
ggplot(absences.contrast, aes(y=estimate, x=contrast, ymin=lower.CL, ymax=upper.CL)) +
geom_point() +
geom_linerange() +
labs(y="Difference in Absence Rates", x="Students",
subtitle="Error bars are 95% CIs", title="Difference in absence rates") +
geom_hline(yintercept=0, lty=2) +
ylim(-1.5, 8) +
coord_flip(),
ncol=2)
# Extracting group means and 95% CIs from the model that also includes the
# main effect of score.t1 (means are adjusted to the average baseline score).
absences.emm.scores <- emmeans(absences.lm.score.t1, ~tutoring)
kable(absences.emm.scores, caption = "Mean absence rates and their 95% CIs by tutoring and score main effects")
| tutoring | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Non-Tutored | 6.222839 | 0.3239965 | 197 | 5.583892 | 6.861785 |
| Tutored | 6.881161 | 0.3239965 | 197 | 6.242215 | 7.520108 |
# Estimating the Tutored - Non-Tutored difference (score-adjusted model) with its 95% CI.
absences.contrast.scores <- confint(pairs(absences.emm.scores, reverse = TRUE))
kable(absences.contrast.scores, caption = "Differences in the absence rates by tutoring and score main effects")
| contrast | estimate | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Tutored - (Non-Tutored) | 0.6583228 | 0.458832 | 197 | -0.2465301 | 1.563176 |
# Same two-panel layout for the score-adjusted model, then stacking both
# layouts to compare the single-predictor model against the main-effects model.
grid2.2 <- grid.arrange(
ggplot(summary(absences.emm.scores), aes(x = tutoring, y = emmean, ymin = lower.CL, ymax = upper.CL, color = tutoring)) +
geom_point() +
geom_linerange() +
labs(y = "Absence Rate", x = "Students", color = "Tutoring",
title = "Mean absence rates along with main effect of score") +
geom_hline(yintercept=0, lty=2) +
ylim(-1.5, 8) +
coord_flip(),
ggplot(absences.contrast.scores, aes(y=estimate, x=contrast, ymin=lower.CL, ymax=upper.CL)) +
geom_point() +
geom_linerange() +
labs(y="Difference in Absence Rates", x="Students",
title="Difference in absence rates along with main effect of score") +
geom_hline(yintercept=0, lty=2) +
ylim(-1.5, 8) +
coord_flip(),
ncol=2)
# Combined figure: row 1 = single-predictor model, row 2 = main-effects model.
grid.arrange(grid2.1, grid2.2, nrow = 2, top = textGrob("Comparison between the single predictor model and the main effects model",gp=gpar(fontsize=20,font=3)))
Data Preparation
# Creating a new column holding each student's change in score over the year.
tutoring_data_new <- tutoring_data_new %>%
mutate(score.diff = (score.t2 - score.t1))
# Finding the mean score change per tutoring group. dplyr::summarise is
# namespaced explicitly because plyr (loaded via Rmisc) masks summarise()
# and would otherwise ignore the grouping.
summary_new <- tutoring_data_new %>% group_by(tutoring) %>% dplyr::summarise(mean_diff = mean(score.diff))
NHST Approach
# NHST: Welch two-sample t-test of the score change (score.diff) by tutoring group.
(scores.diff.t.test <- t.test(score.diff ~ tutoring, tutoring_data_new) )
##
## Welch Two Sample t-test
##
## data: score.diff by tutoring
## t = -5.0811, df = 194.29, p-value = 8.78e-07
## alternative hypothesis: true difference in means between group Non-Tutored and group Tutored is not equal to 0
## 95 percent confidence interval:
## -5.837998 -2.573164
## sample estimates:
## mean in group Non-Tutored mean in group Tutored
## -0.4399544 3.7656265
Estimation Approach
# Estimation approach: linear model of score change on tutoring status.
scores_diff_lm <- lm(score.diff ~ tutoring, tutoring_data_new)
# Extracting each group's estimated mean score change and its 95% CI.
scores_diff_emm <- emmeans(scores_diff_lm, ~ tutoring)
kable(scores_diff_emm, caption = "Score difference means and 95% CIs by tutoring")
| tutoring | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Non-Tutored | -0.4399544 | 0.5852675 | 198 | -1.594112 | 0.7142034 |
| Tutored | 3.7656265 | 0.5852675 | 198 | 2.611469 | 4.9197843 |
# Estimating the Tutored - Non-Tutored difference in mean score change with its 95% CI.
scores_diff_contrast <- confint(pairs(scores_diff_emm, reverse = TRUE))
kable(scores_diff_contrast, caption = "Difference in the means of score changes by tutoring")
| contrast | estimate | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Tutored - (Non-Tutored) | 4.205581 | 0.8276933 | 198 | 2.573355 | 5.837807 |
# Visualizing the estimations: group mean score changes (left) and their
# difference (right); dashed line at 0 marks "no change"/"no effect".
grid2 <- grid.arrange(
ggplot(summary(scores_diff_emm), aes(x=tutoring, y=emmean, ymin=lower.CL, ymax=upper.CL, color = tutoring)) +
geom_point() +
geom_linerange() +
labs(y="Scores", x="Students", color = "Tutoring",
subtitle="Error bars are 95% CIs", title="Mean Score differences after the Scheme") +
geom_hline(yintercept=0, lty=2) +
ylim(-2.5, 7.5),
ggplot(scores_diff_contrast, aes(y=estimate, x=contrast, ymin=lower.CL, ymax=upper.CL)) +
geom_point() +
geom_linerange() +
labs(y="Difference in Scores", x="Students",
subtitle="Error bars are 95% CIs", title="Difference in Score changes after the Scheme") +
geom_hline(yintercept=0, lty=2) +
ylim(-2.5, 7.5),
ncol=2)
# Linear model of score change with main effects of both tutoring and absences
# (no interaction term yet).
scores.diff.absences.lm <- lm(score.diff ~ tutoring + absences, tutoring_data_new)
summary(scores.diff.absences.lm)
##
## Call:
## lm(formula = score.diff ~ tutoring + absences, data = tutoring_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5498 -3.5452 -0.2211 3.4694 15.7400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3101 0.9610 0.323 0.747
## tutoringTutored 4.2626 0.8298 5.137 6.69e-07 ***
## absences -0.1188 0.1208 -0.984 0.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.853 on 197 degrees of freedom
## Multiple R-squared: 0.1197, Adjusted R-squared: 0.1107
## F-statistic: 13.39 on 2 and 197 DF, p-value: 3.525e-06
# Nested-model F test: does adding the absences main effect improve on the
# tutoring-only model of score change?
anova(scores_diff_lm, scores.diff.absences.lm)
## Analysis of Variance Table
##
## Model 1: score.diff ~ tutoring
## Model 2: score.diff ~ tutoring + absences
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 6782.3
## 2 197 6749.1 1 33.177 0.9684 0.3263
The inclusion of the main effect of absences in addition to tutoring does not improve the model according to the anova test
# Linear model of score change with a tutoring x absences interaction
# (the * expands to both main effects plus their interaction).
scores.diff.absences.lm.inter <- lm(score.diff ~ tutoring * absences, tutoring_data_new)
summary(scores.diff.absences.lm.inter)
##
## Call:
## lm(formula = score.diff ~ tutoring * absences, data = tutoring_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6208 -3.5477 -0.2268 3.5930 15.7730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.42453 1.25171 0.339 0.7349
## tutoringTutored 4.03560 1.79026 2.254 0.0253 *
## absences -0.13696 0.17517 -0.782 0.4352
## tutoringTutored:absences 0.03471 0.24235 0.143 0.8863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.868 on 196 degrees of freedom
## Multiple R-squared: 0.1198, Adjusted R-squared: 0.1063
## F-statistic: 8.89 on 3 and 196 DF, p-value: 1.495e-05
# Nested-model F test: does the full interaction model improve on the
# tutoring-only model of score change?
anova(scores_diff_lm, scores.diff.absences.lm.inter)
## Analysis of Variance Table
##
## Model 1: score.diff ~ tutoring
## Model 2: score.diff ~ tutoring * absences
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 6782.3
## 2 196 6748.4 2 33.883 0.492 0.6121
The inclusion of the interactive effect of tutoring and absences also does not improve the model according to the anova test
In order to check whether the students allocated to the tutored and non-tutored groups had similar or different average test scores before the tutoring scheme began, we performed a two-sample t-test to predict their scores(score.t1) by tutoring status:
##
## Welch Two Sample t-test
##
## data: score.t1 by tutoring
## t = -1.0467, df = 196.54, p-value = 0.2965
## alternative hypothesis: true difference in means between group Non-Tutored and group Tutored is not equal to 0
## 95 percent confidence interval:
## -5.433861 1.665720
## sample estimates:
## mean in group Non-Tutored mean in group Tutored
## 52.90345 54.78753
We conclude from our sample of 200 students that the mean score for the tutored student group is not significantly different from that of the non-tutored student group, Welch t(196.54) = -1.05, p = 0.2965.
For getting a better estimation of average test scores before the tutoring scheme, we look at the following means and confidence intervals:
| tutoring | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Non-Tutored | 52.90345 | 1.272791 | 198 | 50.39349 | 55.41342 |
| Tutored | 54.78753 | 1.272791 | 198 | 52.27756 | 57.29749 |
| contrast | estimate | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Tutored - (Non-Tutored) | 1.884071 | 1.799998 | 198 | -1.665557 | 5.433699 |
The mean score for non-tutored students is 52.9 95% CI [50.4 - 55.4]. The mean score for tutored students is 54.8 95% CI [52.3 - 57.3]. The difference is \(1.88\) 95% CI [-1.7 - 5.4] greater for the tutored student group.
Thus, both the p-value and the CI of difference of the scores of the two groups indicate that we cannot say conclusively whether students allocated to the tutored and non-tutored groups had similar or different scores before the tutoring scheme began.
In order to check whether the tutored and non-tutored groups had similar or different rates of absences on average, we performed a two-sample t-test to predict their absences by tutoring status:
##
## Welch Two Sample t-test
##
## data: absences by tutoring
## t = -0.98528, df = 197.6, p-value = 0.3257
## alternative hypothesis: true difference in means between group Non-Tutored and group Tutored is not equal to 0
## 95 percent confidence interval:
## -1.440721 0.480721
## sample estimates:
## mean in group Non-Tutored mean in group Tutored
## 6.312 6.792
We conclude from our sample of 200 students that the mean absence rate for the tutored student group is not significantly different from that of the non-tutored student group, Welch t(197.6) = -0.99, p = 0.3257.
For getting a better estimation of average absence rates, we look at the following means and confidence intervals:
| tutoring | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Non-Tutored | 6.312 | 0.3444817 | 198 | 5.632676 | 6.991324 |
| Tutored | 6.792 | 0.3444817 | 198 | 6.112676 | 7.471324 |
| contrast | estimate | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| Tutored - (Non-Tutored) | 0.48 | 0.4871707 | 198 | -0.4807091 | 1.440709 |
The mean absence rate for non-tutored students is 6.3 95% CI [5.6 - 6.9]. The mean absence rate for tutored students is 6.8 95% CI [6.1 - 7.5]. The difference is 0.48 95% CI [-0.5 - 1.4] greater for the tutored student group.
Thus, both the p-value and the CI of difference of the scores of the two groups indicate that we cannot say conclusively whether tutored and non-tutored groups had similar or different rates of absences on average.
Further analysis showed that there is merit in adding complexity to our model by including the main effect of one of the student scores along with tutoring status when predicting absence rates. However, although this better model changes the means and the 95% CIs slightly, it does not lead to a different conclusion.
In order to check whether tutored students show an increase in their scores compared to the students who did not receive tutoring, we found the difference between their scores at the beginning and the end of the academic year and performed two-sample t-test and estimation process on the data:
##
## Welch Two Sample t-test
##
## data: score.diff by tutoring
## t = -5.0811, df = 194.29, p-value = 8.78e-07
## alternative hypothesis: true difference in means between group Non-Tutored and group Tutored is not equal to 0
## 95 percent confidence interval:
## -5.837998 -2.573164
## sample estimates:
## mean in group Non-Tutored mean in group Tutored
## -0.4399544 3.7656265
We conclude from the t-test on our sample of 200 students that the mean score difference for the tutored student group is significantly greater than that of the non-tutored student group, Welch t(194.29) = -5.08, p < .0001.
For getting a better estimation of the average score changes, we look at the following means and confidence intervals:
## tutoring emmean SE df lower.CL upper.CL
## Non-Tutored -0.44 0.585 198 -1.59 0.714
## Tutored 3.77 0.585 198 2.61 4.920
##
## Confidence level used: 0.95
## contrast estimate SE df lower.CL upper.CL
## Tutored - (Non-Tutored) 4.21 0.828 198 2.57 5.84
##
## Confidence level used: 0.95
The mean score decrease for non-tutored students is 0.44 95% CI [-1.59 - 0.71]. The mean score increase for tutored students is 3.77 95% CI [2.61 - 4.92]. The difference is a score of 4.21 95% CI [2.57 - 5.84] greater for the tutored student group.
Thus, both the p-value and the CI of difference of the scores of the two groups indicate that we can say conclusively that tutored students show an increase in their scores compared to the non-tutored students after the tutoring scheme was implemented.
In order to check for the effect of absences on the change in scores, we first check how the scores behave when either tutoring status or absence rate is held constant (i.e. not changing), then we check for any interaction between tutoring status and absence rate:
##
## Call:
## lm(formula = score.diff ~ tutoring + absences, data = tutoring_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5498 -3.5452 -0.2211 3.4694 15.7400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3101 0.9610 0.323 0.747
## tutoringTutored 4.2626 0.8298 5.137 6.69e-07 ***
## absences -0.1188 0.1208 -0.984 0.326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.853 on 197 degrees of freedom
## Multiple R-squared: 0.1197, Adjusted R-squared: 0.1107
## F-statistic: 13.39 on 2 and 197 DF, p-value: 3.525e-06
Thus, the results of the regression show that there is no significant main effect of absence rate on score change (absences = -0.12, t(197) = -0.98, p = 0.326) but there was a significant main effect of tutoring on score change (tutoring = 4.26, t(197) = 5.14, p < 0.0001)
## Analysis of Variance Table
##
## Model 1: score.diff ~ tutoring
## Model 2: score.diff ~ tutoring + absences
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 6782.3
## 2 197 6749.1 1 33.177 0.9684 0.3263
This is supported by our anova test, which indicates the model is not significantly improved by the additional complexity of including the effect of absence.
Now, we check for any interactivity between tutoring status and absence rate:
##
## Call:
## lm(formula = score.diff ~ tutoring * absences, data = tutoring_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6208 -3.5477 -0.2268 3.5930 15.7730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.42453 1.25171 0.339 0.7349
## tutoringTutored 4.03560 1.79026 2.254 0.0253 *
## absences -0.13696 0.17517 -0.782 0.4352
## tutoringTutored:absences 0.03471 0.24235 0.143 0.8863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.868 on 196 degrees of freedom
## Multiple R-squared: 0.1198, Adjusted R-squared: 0.1063
## F-statistic: 8.89 on 3 and 196 DF, p-value: 1.495e-05
Thus, the results of the regression show that there is no significant main effect of absence rate on score change (absences = -0.14, t(196) = -0.78, p = 0.4352) but there was a significant main effect of tutoring on score change (tutoring = 4.04, t(196) = 2.25, p = 0.0253). There was also no significant interaction between tutoring status and absence rate, although the estimated effect of absence was slightly more positive when tutoring status was 'Tutored' (interaction = 0.03, t(196) = 0.14, p = 0.8863)
## Analysis of Variance Table
##
## Model 1: score.diff ~ tutoring
## Model 2: score.diff ~ tutoring * absences
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 6782.3
## 2 196 6748.4 2 33.883 0.492 0.6121
This conclusion is also supported by the anova test, which indicates the model is not significantly improved by the additional complexity of including the interactivity effect of absence.
Thus, we can conclude by saying that there was no significant effect of absences on change in scores, neither did it have any interaction with the effect of tutoring.
| Variable | Description |
|---|---|
| Name | The name of the beer |
| Style | The style of the beer |
| Brewery | The name of the manufacturer of the beer |
| ABV | Abbreviation for ‘Alcohol by volume’ - indicates how much of the total volume of liquid in a beer is made up of alcohol |
| rating | The rating of the beer in a scale of 1-5 |
| minIBU | Abbreviation for ‘International Bitterness Units’ - indicates the minimum level of a beer’s bitterness |
| maxIBU | Abbreviation for ‘International Bitterness Units’ - indicates the maximum level of a beer’s bitterness |
| Astringency | Beer astringency is an off flavor and is perceived as a dry grainy, mouth-puckering, tannic sensation. |
| Body | Describes how heavy or light the beer is |
| Alcohol | The alcohol content in the beer |
| Bitter | The bitterness of a beer |
| Sweet | The sweetness of a beer |
| Sour | The sourness of a beer |
| Salty | The saltiness of a beer |
| Fruits | The fruitiness of a beer |
| Hoppy | The Hops content of a beer |
| Spices | The Spice content of a beer |
| Malty | The Malt content of a beer |
# Reading the craft-beer file into R (comma-delimited text despite the .txt
# extension); column types are guessed and reported by read_csv.
beer_data <- read_csv("Craft-Beer_data_set.txt")
# Checking the structure of the dataset (column types, row count).
str(beer_data)
## spec_tbl_df [5,558 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Name : chr [1:5558] "Amber" "Double Bag" "Long Trail Ale" "Doppelsticke" ...
## $ Style : chr [1:5558] "Altbier" "Altbier" "Altbier" "Altbier" ...
## $ Brewery : chr [1:5558] "Alaskan Brewing Co." "Long Trail Brewing Co." "Long Trail Brewing Co." "Uerige Obergärige Hausbrauerei" ...
## $ ABV : num [1:5558] 5.3 7.2 5 8.5 5.3 7.2 6 5.3 5 4.8 ...
## $ rating : num [1:5558] 3.65 3.9 3.58 4.15 3.67 3.78 4.1 3.46 3.6 4.1 ...
## $ minIBU : num [1:5558] 25 25 25 25 25 25 25 25 25 25 ...
## $ maxIBU : num [1:5558] 50 50 50 50 50 50 50 50 50 50 ...
## $ Astringency: num [1:5558] 13 12 14 13 21 25 22 28 18 25 ...
## $ Body : num [1:5558] 32 57 37 55 69 51 45 40 49 35 ...
## $ Alcohol : num [1:5558] 9 18 6 31 10 26 13 3 5 4 ...
## $ Bitter : num [1:5558] 47 33 42 47 63 44 46 40 37 38 ...
## $ Sweet : num [1:5558] 74 55 43 101 120 45 62 58 73 39 ...
## $ Sour : num [1:5558] 33 16 11 18 14 9 25 29 22 13 ...
## $ Salty : num [1:5558] 0 0 0 1 0 1 1 0 0 1 ...
## $ Fruits : num [1:5558] 33 24 10 49 19 11 34 36 21 8 ...
## $ Hoppy : num [1:5558] 57 35 54 40 36 51 60 54 37 60 ...
## $ Spices : num [1:5558] 8 12 4 16 15 20 4 8 4 16 ...
## $ Malty : num [1:5558] 111 84 62 119 218 95 103 97 98 97 ...
## - attr(*, "spec")=
## .. cols(
## .. Name = col_character(),
## .. Style = col_character(),
## .. Brewery = col_character(),
## .. ABV = col_double(),
## .. rating = col_double(),
## .. minIBU = col_double(),
## .. maxIBU = col_double(),
## .. Astringency = col_double(),
## .. Body = col_double(),
## .. Alcohol = col_double(),
## .. Bitter = col_double(),
## .. Sweet = col_double(),
## .. Sour = col_double(),
## .. Salty = col_double(),
## .. Fruits = col_double(),
## .. Hoppy = col_double(),
## .. Spices = col_double(),
## .. Malty = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#Checking for any NA values and obvious outliers in the numeric columns
#(summary() would report an "NA's" row for any column containing missing values)
summary(beer_data)
## Name Style Brewery ABV rating
## Length:5558 Length:5558 Length:5558 Min. : 0.000 Min. :1.27
## Class :character Class :character Class :character 1st Qu.: 5.000 1st Qu.:3.59
## Mode :character Mode :character Mode :character Median : 6.000 Median :3.82
## Mean : 6.634 Mean :3.76
## 3rd Qu.: 7.900 3rd Qu.:4.04
## Max. :57.500 Max. :4.83
## minIBU maxIBU Astringency Body Alcohol
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:10.00 1st Qu.: 25.00 1st Qu.: 8.00 1st Qu.: 25.00 1st Qu.: 5.00
## Median :20.00 Median : 35.00 Median :14.00 Median : 38.00 Median : 10.00
## Mean :20.72 Mean : 38.45 Mean :15.94 Mean : 42.75 Mean : 15.98
## 3rd Qu.:25.00 3rd Qu.: 45.00 3rd Qu.:22.00 3rd Qu.: 55.00 3rd Qu.: 20.00
## Max. :65.00 Max. :100.00 Max. :83.00 Max. :197.00 Max. :139.00
## Bitter Sweet Sour Salty Fruits
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 13.00 1st Qu.: 27.00 1st Qu.: 9.00 1st Qu.: 0.000 1st Qu.: 10.00
## Median : 29.00 Median : 49.50 Median : 21.00 Median : 0.000 Median : 28.00
## Mean : 34.32 Mean : 53.63 Mean : 34.61 Mean : 1.314 Mean : 39.38
## 3rd Qu.: 51.00 3rd Qu.: 74.00 3rd Qu.: 44.00 3rd Qu.: 1.000 3rd Qu.: 61.75
## Max. :150.00 Max. :263.00 Max. :323.00 Max. :66.000 Max. :222.00
## Hoppy Spices Malty
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 14.00 1st Qu.: 4.00 1st Qu.: 33.00
## Median : 30.00 Median : 9.00 Median : 65.00
## Mean : 38.41 Mean : 17.58 Mean : 68.59
## 3rd Qu.: 56.00 3rd Qu.: 22.00 3rd Qu.: 99.00
## Max. :193.00 Max. :184.00 Max. :304.00
#Removing NA values (rows with any missing value are dropped)
beer_data <- na.omit(beer_data)
#Checking for duplicate data: if the distinct row count equals nrow(beer_data),
#there are no duplicated rows left at this point
nrow(distinct(beer_data)) #No duplicates as the distinct entries are equal to the number of rows in the dataset
## [1] 5556
#If duplicates existed, the idiomatic way to remove them is:
#beer_data <- distinct(beer_data)
#(The previous suggestion, beer_data[which(beer_data == distinct(beer_data)),],
# would not work: `==` compares element-wise and which() on a data frame does
# not produce valid row indices.)
#Collapsing the many beer styles into the required broad categories.
#NOTE: the match order below matters - "IPA" must be tested before "Pale",
#because IPA style names also contain the word "Pale".
style_groups <- c("IPA", "Lager", "Porter", "Stout", "Wheat", "Pale", "Pilsner", "Bock")
for (grp in style_groups) {
  beer_data[grepl(grp, beer_data$Style), "Style"] <- grp
}
#Any style not matched by one of the groups above is labelled "Other"
beer_data[!(beer_data$Style %in% style_groups), "Style"] <- "Other"
#Converting the beer category column into factor
beer_data$Style <- as.factor(beer_data$Style)
#Checking if the levels of factors are set properly
levels(beer_data$Style)
## [1] "Bock" "IPA" "Lager" "Other" "Pale" "Pilsner" "Porter" "Stout" "Wheat"
#Generating summary statistics: mean and SD of rating within each Style
beer_data_summary <- beer_data %>%
group_by(Style) %>%
summarise(mean_rating = mean(rating), std_rating = sd(rating))
#Storing the summary values to individual variables
#NOTE(review): these are length-9 vectors (one value per Style), not scalars
rating_sd <- beer_data_summary$std_rating
rating_mean <- beer_data_summary$mean_rating
#Visualizing the distribution of rating data by beer categories and comparing to normal distribution
#NOTE(review): stat_function() receives the full length-9 mean/sd vectors, which are
#recycled across the x grid rather than matched to each facet's Style - verify that
#the overlaid curve in each facet is the intended per-category normal.
#geom_vline() with the length-9 mean vector draws one green line per category mean.
ggplot(beer_data, aes(x=rating)) +
geom_histogram(aes(y=..density..), binwidth = 0.05) +
stat_function(fun=function(x) {dnorm(x, mean=rating_mean, sd=rating_sd)}) +
geom_vline(xintercept = beer_data_summary$mean_rating, color = "Green") +
facet_wrap(~Style) +
labs(x="Ratings", y="Density",
title = "Distribution of Ratings by Beer Categories")
The distributions seem fairly normal, although a few show slight positive or negative skewness. Overall, the distributions appear a reasonable fit, so we can continue with our analysis.
Estimation Approach
#Creating the linear model predicting rating from beer category.
#With treatment coding, the intercept (3.8115) is the mean rating of the first
#factor level (Bock), and each StyleX coefficient is that category's difference from it.
(beer.lm <- lm(rating ~ Style, beer_data) )
##
## Call:
## lm(formula = rating ~ Style, data = beer_data)
##
## Coefficients:
## (Intercept) StyleIPA StyleLager StyleOther StylePale StylePilsner StylePorter
## 3.811520 0.217909 -0.454156 -0.004102 -0.037400 -0.121187 0.155880
## StyleStout StyleWheat
## 0.187530 -0.100320
#Extracting the estimated marginal means and 95% CIs for each Style
beer.emm <- emmeans(beer.lm, ~ Style)
#Converting to a data frame and sorting categories from highest to lowest mean rating.
#(The previous group_by(emmean) was a no-op: arrange() ignores grouping unless
# .by_group = TRUE, and grouping by a unique numeric column adds nothing.)
beer.emm <- as.data.frame(beer.emm) %>%
arrange(desc(emmean))
#Creating a table to show the estimations
(kable1 <- kable(beer.emm, caption = "Mean rating and 95% CIs within each category") )
| Style | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| IPA | 4.029429 | 0.0212219 | 5547 | 3.987825 | 4.071032 |
| Stout | 3.999050 | 0.0198512 | 5547 | 3.960134 | 4.037966 |
| Porter | 3.967400 | 0.0229222 | 5547 | 3.922463 | 4.012337 |
| Bock | 3.811520 | 0.0251101 | 5547 | 3.762294 | 3.860746 |
| Other | 3.807418 | 0.0077758 | 5547 | 3.792175 | 3.822662 |
| Pale | 3.774120 | 0.0251101 | 5547 | 3.724894 | 3.823346 |
| Wheat | 3.711200 | 0.0212219 | 5547 | 3.669597 | 3.752803 |
| Pilsner | 3.690333 | 0.0324169 | 5547 | 3.626783 | 3.753883 |
| Lager | 3.357364 | 0.0132415 | 5547 | 3.331405 | 3.383322 |
#Visualizing the data: point = estimated mean, error bars = 95% CI, violins = raw
#rating distribution per category
ggplot(beer.emm, aes(x=reorder(Style, -emmean), #Arranging means in descending order
y=emmean, ymin=lower.CL, ymax=upper.CL,
color = Style)) +
geom_point() +
#NOTE(review): rating_mean is a length-9 vector, so this draws one dotted
#reference line per category mean rather than a single grand-mean line
geom_hline(aes(yintercept=rating_mean), linetype = "dotted") +
geom_errorbar(width = 0.3, stat = "identity") +
labs(x="Category", y="Rating", color = "Category", fill = "Category",
subtitle="Error bars are 95% CIs", title = "Distribution of ratings for each beer category") +
geom_text(aes(label = round(emmean, 2)), hjust = 1.5, size = 3, color = "Black") + #Inserting labels on means
#ymin/ymax must be reset to NULL here so the violins ignore the CI aesthetics
geom_violin(data=beer_data,
mapping=aes(x=Style, y=rating, ymin=NULL, ymax=NULL, color = Style, fill = Style),
alpha = 0.1) #Adding the distribution of the entire dataset
A linear model was used to predict beer rating by category. Using the model, the following mean ratings and 95% CIs were found for each category of beer:
| Style | emmean | SE | df | lower.CL | upper.CL |
|---|---|---|---|---|---|
| IPA | 4.029429 | 0.0212219 | 5547 | 3.987825 | 4.071032 |
| Stout | 3.999050 | 0.0198512 | 5547 | 3.960134 | 4.037966 |
| Porter | 3.967400 | 0.0229222 | 5547 | 3.922463 | 4.012337 |
| Bock | 3.811520 | 0.0251101 | 5547 | 3.762294 | 3.860746 |
| Other | 3.807418 | 0.0077758 | 5547 | 3.792175 | 3.822662 |
| Pale | 3.774120 | 0.0251101 | 5547 | 3.724894 | 3.823346 |
| Wheat | 3.711200 | 0.0212219 | 5547 | 3.669597 | 3.752803 |
| Pilsner | 3.690333 | 0.0324169 | 5547 | 3.626783 | 3.753883 |
| Lager | 3.357364 | 0.0132415 | 5547 | 3.331405 | 3.383322 |
A violin plot is used to show the distribution of the ratings within each category and the mean ratings and 95% CIs are shown using the error bars in the following figure:
#Generating the summary statistics: overall mean and SD for every numeric variable.
#NOTE(review): the names sd_Astrigency and sd_Fruit contain spelling slips, but the
#plotting code below references them with the same spelling, so they are kept as-is.
beer_data_summary2 <- beer_data %>% summarise(mean_abv = mean(ABV), sd_abv = sd(ABV),
mean_rating = mean(rating), sd_rating = sd(rating),
mean_minIBU = mean(minIBU), sd_minIBU = sd(minIBU),
mean_maxIBU = mean(maxIBU), sd_maxIBU = sd(maxIBU),
mean_Astringency = mean(Astringency), sd_Astrigency = sd(Astringency),
mean_Body = mean(Body), sd_Body = sd(Body),
mean_Alcohol = mean(Alcohol), sd_Alcohol = sd(Alcohol),
mean_Bitter = mean(Bitter), sd_Bitter = sd(Bitter),
mean_Sweet = mean(Sweet), sd_Sweet = sd(Sweet),
mean_Sour = mean(Sour), sd_Sour = sd(Sour),
mean_Salty = mean(Salty), sd_Salty = sd(Salty),
mean_Fruits = mean(Fruits), sd_Fruit = sd(Fruits),
mean_Hoppy = mean(Hoppy), sd_Hoppy = sd(Hoppy),
mean_Spices = mean(Spices), sd_Spices = sd(Spices),
mean_Malty = mean(Malty), sd_Malty = sd(Malty)
)
#Checking the distribution of all the related variables: one histogram per variable,
#each overlaid with the normal curve implied by that variable's mean and SD.
#xlim() is applied to ABV and Salty only, to crop extreme outliers from the display
#(ggplot will warn about rows removed by the limits - expected here).
grid.arrange((ggplot(beer_data, aes(x=ABV)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_abv, sd=beer_data_summary2$sd_abv)}) +
labs(x="ABV", y="Density")) +
xlim(-5,30), #Not including the outliers for visualizing
(ggplot(beer_data, aes(x=rating)) +
geom_histogram(aes(y=..density..), binwidth = 0.05) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_rating, sd=beer_data_summary2$sd_rating)}) +
labs(x="Ratings", y="Density")),
(ggplot(beer_data, aes(x=minIBU)) +
geom_histogram(aes(y=..density..), binwidth = 2) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_minIBU, sd=beer_data_summary2$sd_minIBU)}) +
labs(x="minIBU", y="Density")),
(ggplot(beer_data, aes(x=maxIBU)) +
geom_histogram(aes(y=..density..), binwidth = 2) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_maxIBU, sd=beer_data_summary2$sd_maxIBU)}) +
labs(x="maxIBU", y="Density")),
(ggplot(beer_data, aes(x=Astringency)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Astringency, sd=beer_data_summary2$sd_Astrigency)}) +
labs(x="Astringency", y="Density")),
(ggplot(beer_data, aes(x=Body)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Body, sd=beer_data_summary2$sd_Body)}) +
labs(x="Body", y="Density")),
(ggplot(beer_data, aes(x=Alcohol)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Alcohol, sd=beer_data_summary2$sd_Alcohol)}) +
labs(x="Alcohol", y="Density")),
(ggplot(beer_data, aes(x=Bitter)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Bitter, sd=beer_data_summary2$sd_Bitter)}) +
labs(x="Bitter", y="Density")),
(ggplot(beer_data, aes(x=Sweet)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Sweet, sd=beer_data_summary2$sd_Sweet)}) +
labs(x="Sweet", y="Density")),
(ggplot(beer_data, aes(x=Sour)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Sour, sd=beer_data_summary2$sd_Sour)}) +
labs(x="Sour", y="Density")),
(ggplot(beer_data, aes(x=Salty)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Salty, sd=beer_data_summary2$sd_Salty)}) +
labs(x="Salty", y="Density")) +
xlim(-5, 20),
(ggplot(beer_data, aes(x=Fruits)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Fruits, sd=beer_data_summary2$sd_Fruit)}) +
labs(x="Fruits", y="Density")),
(ggplot(beer_data, aes(x=Hoppy)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Hoppy, sd=beer_data_summary2$sd_Hoppy)}) +
labs(x="Hoppy", y="Density")),
(ggplot(beer_data, aes(x=Spices)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Spices, sd=beer_data_summary2$sd_Spices)}) +
labs(x="Spices", y="Density")),
(ggplot(beer_data, aes(x=Malty)) +
geom_histogram(aes(y=..density..), binwidth = 1) +
stat_function(fun=function(x) {dnorm(x, mean=beer_data_summary2$mean_Malty, sd=beer_data_summary2$sd_Malty)}) +
labs(x="Malty", y="Density")),
top = textGrob("Distributions of each variable",gp=gpar(fontsize=15,font=3)))
We are more interested in the distributions of ratings, ABV, Sweet and Malty. Ratings and ABV seem fairly normal, however, Sweet and Malty have a lot of positive skewness. We will keep this in mind while performing the analysis.
#Checking Pearson correlations for the relevant variables.
#Bug fix: type = "pearson" belongs to rcorr(), not select()/as.matrix().
#Previously it sat inside the as.matrix() call, where it was silently swallowed;
#the output was unchanged only because rcorr() defaults to Pearson.
rcorr(as.matrix(select(beer_data, rating, ABV, Sweet, Malty)), type = "pearson")
## rating ABV Sweet Malty
## rating 1.00 0.40 0.29 0.17
## ABV 0.40 1.00 0.40 0.19
## Sweet 0.29 0.40 1.00 0.56
## Malty 0.17 0.19 0.56 1.00
##
## n= 5556
##
##
## P
## rating ABV Sweet Malty
## rating 0 0 0
## ABV 0 0 0
## Sweet 0 0 0
## Malty 0 0 0
#Visualizing the pairwise relationships among rating, ABV, Sweet and Malty.
#Each panel is a scatterplot with a fitted least-squares line; the panels are
#built as named objects first, then arranged into a single figure.
corr_abv_rating <- ggplot(beer_data, aes(x = ABV, y = rating)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(y = "Rating")
corr_sweet_rating <- ggplot(beer_data, aes(x = Sweet, y = rating)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(y = "Rating")
corr_malty_rating <- ggplot(beer_data, aes(x = Malty, y = rating)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(y = "Rating")
corr_sweet_malty <- ggplot(beer_data, aes(x = Sweet, y = Malty)) +
  geom_point() +
  geom_smooth(method = lm)
corr_abv_malty <- ggplot(beer_data, aes(x = ABV, y = Malty)) +
  geom_point() +
  geom_smooth(method = lm)
corr_abv_sweet <- ggplot(beer_data, aes(x = ABV, y = Sweet)) +
  geom_point() +
  geom_smooth(method = lm)
#Arranging all six panels into one figure
grid.arrange(corr_abv_rating, corr_sweet_rating, corr_malty_rating,
             corr_sweet_malty, corr_abv_malty, corr_abv_sweet,
             top = textGrob("Visualizing correlations", gp = gpar(fontsize = 20, font = 3))
)
NHST Approach
#Creating the simple linear model with ABV as the sole predictor of rating
abv_lm <- lm(rating ~ ABV, beer_data)
#Inspecting coefficients, t-tests and R-squared
summary(abv_lm)
##
## Call:
## lm(formula = rating ~ ABV, data = beer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3418 -0.1463 0.0526 0.2322 1.0088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.297235 0.015351 214.79 <2e-16 ***
## ABV 0.069819 0.002163 32.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4064 on 5554 degrees of freedom
## Multiple R-squared: 0.158, Adjusted R-squared: 0.1578
## F-statistic: 1042 on 1 and 5554 DF, p-value: < 2.2e-16
#Creating an intercept-only baseline model (predicts the grand mean for every beer)
abv_base <- lm(rating ~ 1, beer_data)
#Checking whether the model is improved by the use of ABV as predictor
#(F-test on the nested models: baseline vs. baseline + ABV)
anova(abv_base, abv_lm)
## Analysis of Variance Table
##
## Model 1: rating ~ 1
## Model 2: rating ~ ABV
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5555 1089.4
## 2 5554 917.3 1 172.11 1042.1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Comparison with the baseline linear model and the linear model with predictor in anova test shows that the additional complexity of including ABV makes our model significantly better.
Estimation Approach
#Extracting the coefficients and their 95% CIs from the linear model
abv_est <- cbind(coef(abv_lm), confint(abv_lm))
#Adding the model's fitted ratings (for the observed ABV values) as a new column
beer_data <- beer_data %>% mutate(rating.hat = predict(abv_lm))
#Visualizing the residuals: each vertical segment connects an observed rating
#to its fitted value (marked with an "x") on the regression line
ggplot(beer_data, aes(x = ABV, y = rating, ymin = rating, ymax = rating.hat)) +
geom_point() +
geom_linerange() +
geom_smooth(method = lm) +
geom_point(aes(y = rating.hat), shape = "x", size = 3) +
labs(y = "Ratings")
Sweet Flavor
#Creating the additive (main effects only) linear model for the Sweet flavor:
flavor_lm.s <- lm(rating ~ ABV + Sweet, beer_data)
summary(flavor_lm.s)
##
## Call:
## lm(formula = rating ~ ABV + Sweet, data = beer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6915 -0.1553 0.0460 0.2329 1.0836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.266784 0.015380 212.41 <2e-16 ***
## ABV 0.058734 0.002333 25.18 <2e-16 ***
## Sweet 0.001939 0.000164 11.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4014 on 5553 degrees of freedom
## Multiple R-squared: 0.1787, Adjusted R-squared: 0.1784
## F-statistic: 604 on 2 and 5553 DF, p-value: < 2.2e-16
#Creating the linear model for Sweet flavor with an ABV:Sweet interaction term
#(ABV * Sweet expands to ABV + Sweet + ABV:Sweet):
flavor_lm_inter.s <- lm(rating ~ ABV * Sweet, beer_data)
summary(flavor_lm_inter.s)
##
## Call:
## lm(formula = rating ~ ABV * Sweet, data = beer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1734 -0.1570 0.0453 0.2329 1.0866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1889925 0.0243314 131.065 < 2e-16 ***
## ABV 0.0700376 0.0035981 19.465 < 2e-16 ***
## Sweet 0.0034591 0.0004035 8.574 < 2e-16 ***
## ABV:Sweet -0.0002007 0.0000487 -4.122 3.81e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4008 on 5552 degrees of freedom
## Multiple R-squared: 0.1812, Adjusted R-squared: 0.1807
## F-statistic: 409.5 on 3 and 5552 DF, p-value: < 2.2e-16
#Checking to see which model is better: F-test on the nested models
#(main effects vs. main effects + interaction)
anova(flavor_lm.s, flavor_lm_inter.s)
## Analysis of Variance Table
##
## Model 1: rating ~ ABV + Sweet
## Model 2: rating ~ ABV * Sweet
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5553 894.77
## 2 5552 892.04 1 2.73 16.991 3.81e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The anova test concludes that the additional complexity of including interaction effect of Sweet significantly improves the model. We will focus on this model while making interpretations.
#Creating a prediction grid over ABV (0-100) and Sweet (0-200).
#Improvement: expand.grid() is computed once with named columns and reused,
#instead of being built twice and unlisted column-by-column.
sweet.grid <- expand.grid(ABV = seq(0, 100, 1), Sweet = seq(0, 200, 5))
intr.surf.data.s <- tibble(ABV = sweet.grid$ABV,
Sweet = sweet.grid$Sweet)
#Adding predictions from both the main-effects and interaction models
intr.surf.data.s <- mutate(intr.surf.data.s,
main.hat = predict(flavor_lm.s, intr.surf.data.s),
intr.hat = predict(flavor_lm_inter.s, intr.surf.data.s))
#Visualizing the predicted-rating surfaces for both models
surf.main.s <- ggplot(intr.surf.data.s, aes(ABV, Sweet)) +
geom_contour_filled(aes(z = main.hat)) +
labs(subtitle = "Main Effects") +
guides(fill=guide_legend(title="Ratings"))
surf.intr.s <- ggplot(intr.surf.data.s, aes(ABV, Sweet)) +
geom_contour_filled(aes(z = intr.hat)) +
labs(subtitle = "Interaction Effects") +
guides(fill=guide_legend(title="Ratings"))
grid.arrange(surf.main.s, surf.intr.s, nrow = 2)
#Visualizing the predictions at selected constant ABV levels
#(solid lines = main-effects model, dashed lines = interaction model)
(effect.s <- filter(intr.surf.data.s, ABV %in% c(2, 3, 4, 12, 13, 17 , 18, 26, 27, 28)) %>%
mutate(ABV = factor(ABV)) %>%
ggplot() +
geom_line(aes(Sweet, main.hat, colour = ABV), size = 1) +
geom_line(aes(Sweet, intr.hat, colour = ABV), linetype = "dashed", size = 1) +
ylim(3,6) +
ylab("Rating"))
Malty Flavor
#Creating the additive (main effects only) linear model for the Malty flavor:
flavor_lm.m <- lm(rating ~ ABV + Malty, beer_data)
summary(flavor_lm.m)
##
## Call:
## lm(formula = rating ~ ABV + Malty, data = beer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1210 -0.1520 0.0417 0.2235 1.0655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2530643 0.0163266 199.249 < 2e-16 ***
## ABV 0.0666807 0.0021905 30.441 < 2e-16 ***
## Malty 0.0009474 0.0001238 7.653 2.31e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4043 on 5553 degrees of freedom
## Multiple R-squared: 0.1668, Adjusted R-squared: 0.1665
## F-statistic: 555.7 on 2 and 5553 DF, p-value: < 2.2e-16
#Creating a linear model with both flavors included as predictors
flavor_lm.m.s <- lm(rating ~ ABV + Malty + Sweet, beer_data)
#Checking variance inflation factors for each model (multicollinearity diagnostic;
#values well below 5 indicate the predictors are not problematically correlated)
vif(flavor_lm.s)
## ABV Sweet
## 1.192518 1.192518
vif(flavor_lm.m)
## ABV Malty
## 1.036326 1.036326
vif(flavor_lm.m.s)
## ABV Malty Sweet
## 1.195358 1.455948 1.675384
#Creating the linear model for Malty flavor with an ABV:Malty interaction term:
flavor_lm_inter.m <- lm(rating ~ ABV * Malty, beer_data)
summary(flavor_lm_inter.m)
##
## Call:
## lm(formula = rating ~ ABV * Malty, data = beer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4349 -0.1510 0.0430 0.2277 1.0475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.352e+00 2.547e-02 131.606 < 2e-16 ***
## ABV 5.228e-02 3.590e-03 14.560 < 2e-16 ***
## Malty -5.897e-04 3.281e-04 -1.797 0.0724 .
## ABV:Malty 2.142e-04 4.236e-05 5.057 4.4e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4034 on 5552 degrees of freedom
## Multiple R-squared: 0.1706, Adjusted R-squared: 0.1701
## F-statistic: 380.6 on 3 and 5552 DF, p-value: < 2.2e-16
#Creating a prediction grid over ABV (0-100) and Malty (0-200).
#Improvement: expand.grid() is computed once with named columns and reused,
#instead of being built twice and unlisted column-by-column.
malty.grid <- expand.grid(ABV = seq(0, 100, 1), Malty = seq(0, 200, 5))
intr.surf.data.m <- tibble(ABV = malty.grid$ABV,
Malty = malty.grid$Malty)
#Adding predictions from both the main-effects and interaction models
intr.surf.data.m <- mutate(intr.surf.data.m,
main.hat = predict(flavor_lm.m, intr.surf.data.m),
intr.hat = predict(flavor_lm_inter.m, intr.surf.data.m))
#Visualizing the predicted-rating surfaces for both models
surf.main.m <- ggplot(intr.surf.data.m, aes(ABV, Malty)) +
geom_contour_filled(aes(z = main.hat)) +
labs(subtitle = "Main Effects") +
guides(fill=guide_legend(title="Ratings"))
surf.intr.m <- ggplot(intr.surf.data.m, aes(ABV, Malty)) +
geom_contour_filled(aes(z = intr.hat)) +
labs(subtitle = "Interaction Effects") +
guides(fill=guide_legend(title="Ratings"))
grid.arrange(surf.main.m, surf.intr.m, nrow = 2)
#Visualizing the predictions at selected constant ABV levels
#(solid lines = main-effects model, dashed lines = interaction model)
(effect.m <- filter(intr.surf.data.m, ABV %in% c(2, 3, 4, 12, 13, 17 , 18, 26, 27, 28)) %>%
mutate(ABV = factor(ABV)) %>%
ggplot() +
geom_line(aes(Malty, main.hat, colour = ABV), size = 1) +
geom_line(aes(Malty, intr.hat, colour = ABV), linetype = "dashed", size = 1) +
ylim(3,6) +
ylab("Rating"))
Final Comparison
In order to find whether, on average, a beer receives a higher rating if it has a higher or lower ABV, we looked into the possible correlation between Ratings and ABV.
The test showed a significant positive association between these variables. Specifically, we found a significant (\(p < .001\)) correlation of \(0.4\) between Ratings and ABV. Thus, we used a linear regression to examine whether this relationship is significant in a linear model.
##
## Call:
## lm(formula = rating ~ ABV, data = beer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3418 -0.1463 0.0526 0.2322 1.0088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.297235 0.015351 214.79 <2e-16 ***
## ABV 0.069819 0.002163 32.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4064 on 5554 degrees of freedom
## Multiple R-squared: 0.158, Adjusted R-squared: 0.1578
## F-statistic: 1042 on 1 and 5554 DF, p-value: < 2.2e-16
The model shows that ratings increase by 0.07 points for every one-unit increase in ABV. This increase is significantly different from zero, t(5554)=32.28, p&lt;.0001.
We checked how using this linear model would be effective in predicting ratings by comparing against the observed ratings, ie, the residuals. The line that minimizes the square of the residuals has been chosen.
Hence, due to the nature of the positive correlation between rating and ABV, we can conclude by saying that, on average, a beer with higher ABV has higher rating and beer with lower ABV has lower rating. The figure shows the ability of the model to predict beer ratings based on ABV for newer data.
In order to investigate, we created multiple linear regression models, where we looked into:
Regression model with main effects of ABV and Sweet flavor.
Regression model with main effects of ABV and Malty flavor.
Regression model with interaction between ABV and Sweet flavor.
Regression model with interaction between ABV and Malty flavor.
From the main effects model, we have found that the ratings increase by a statistically significant 0.06, t(5553)=25.18, p<.0001, for every extra ABV value, holding the Sweetness measure constant. On the other hand, when controlling for the ABV values, the ratings increased by 0.002 for every extra unit of sweetness, which is significantly different from zero t(5553)=11.83, p<.0001.
Again, the ratings increase by a statistically significant 0.07, t(5553)=30.44, p<0.0001, for every extra ABV value, holding the Maltiness measure constant. On the other hand, when controlling for the ABV values, the ratings increased by 0.001 for every extra unit of maltiness, which is significantly different from zero t(5553)=7.65, p<.0001.
However, our findings indicate that the model including the interaction between the flavors and ABV are significantly better and we will focus our interpretations on that. We used the models to enter multiple predictors to give us the best possible understanding of the effect of the two flavors (Sweet and Malty) with ABV held constant at various levels. The following figure shows us how the ratings will behave accordingly:
In the above figure, the dashed lines show predictions from the interaction models, and the solid lines show predictions from the main effects models. In the main effects models, the slopes of the flavors against Rating are parallel for all different values of ABV. In the interaction models, the slopes of Sweet against Rating are steeper for low values of ABV and shallower for high values of ABV, while the slopes of Malty against Rating are shallower for low values of ABV and steeper for high values of ABV.
From the analysis, we can conclude that:
In order to maximize ratings, the company should use more Malty flavor with beers having high ABV and use more Sweet flavor in beers with low ABV.
As the interaction model including ABV and Malty shows that at each higher ABV levels there is greater positive correlation between ratings and Malty, this flavor should be used if they are creating a high ABV beer.
As the interaction model including ABV and Sweet shows that at each lower ABV levels there is greater positive correlation between ratings and Sweet, this flavor should be used if they are creating a low ABV beer.
This is to certify that the work I am submitting is my own. All external references and sources are clearly acknowledged and identified within the contents. I am aware of the University of Warwick regulation concerning plagiarism and collusion.
No substantial part(s) of the work submitted here has also been submitted by me in other assessments for accredited courses of study, and I acknowledge that if this has been done an appropriate reduction in the mark I might otherwise have received will be made.